# coding=utf-8
import json
import os
import re
import time
from urllib import parse

import requests

# Identity code of the account to crawl; it is sent as the `xpt` query
# parameter of the news-list API (the value looks Base64-encoded).
xpt = 'cHBhZzEwOTkwNDQyZDYzMEBzb2h1LmNvbQ=='

# HTTP headers sent with every request: a desktop Chrome User-Agent string.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Safari/537.36'
    ),
}


def detailParse(url):
    """Fetch an article page and return its sanitized ``<article>`` HTML.

    The page at *url* is downloaded and the span from the first ``<article``
    to the last ``</article>`` is extracted. The "back to Sohu" footer link is
    then removed and contact details are masked:

    * every run of 6 digits becomes ``000000`` (postal code),
    * ``ddd-dddddddd`` becomes ``010-88888888`` (phone number with area code),
    * every run of 8 digits becomes ``88888888`` (bare phone number),
    * ``www.xisuedu.com`` becomes ``www.baidu.com``.

    NOTE: the digit patterns are not anchored, so any run of six or more
    digits (including digits inside URLs or attributes) is rewritten too.

    Args:
        url: Absolute URL of the article page.

    Returns:
        The cleaned article HTML as a string, or ``None`` when the request
        fails or the page contains no ``<article>`` element.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=header, timeout=10)
    except requests.RequestException as exc:
        print('detailParse: request failed for {}: {}'.format(url, exc))
        return None

    # Greedy match: from the first "<article" to the last "</article>".
    match = re.search(r'<article[\s\S]*</article>', response.text)
    if match is None:
        # Return a real None instead of running the masks over the text "None".
        return None
    html = match.group(0)

    # Footer block appended to articles ("return to Sohu" link + empty editor line).
    re_string = '<a href="//www.sohu.com/?strategyid=00001 " target="_blank" title="点击进入搜狐首页" id="backsohucom" style="white-space: nowrap;"><span class="backword"><i class="backsohu"></i>返回搜狐，查看更多</span></a></p>      <p data-role="editor-name">责任编辑：<span></span></p>\n'
    html = html.replace(re_string, '')

    html = re.sub(r'\d{6}', '000000', html)  # mask postal codes
    html = re.sub(r'\d{3}-\d{8}', '010-88888888', html)  # mask phone numbers (area code form)
    html = re.sub(r'\d{8}', '88888888', html)  # mask phone numbers (bare form)
    # Literal replacement: as a regex the dots would match any character.
    html = html.replace('www.xisuedu.com', 'www.baidu.com')  # swap the site address
    return html


def _decode_news_payload(text):
    """Decode the body of a news-list API response into a Python object."""
    try:
        payload = json.loads(text)
        # The legacy unwrapping (strip first/last char, drop backslashes)
        # suggests the endpoint returns its JSON document wrapped in a JSON
        # string literal; decoding twice handles that without corrupting
        # escaped quotes or \uXXXX sequences.
        if isinstance(payload, str):
            payload = json.loads(payload)
        return payload
    except ValueError:
        # Fall back to the legacy unwrapping for bodies that are not valid
        # JSON as a whole (e.g. wrapped in some other pair of characters).
        return json.loads(text[1:-1].replace('\\', ''))


def _save_article(data):
    """Download one article listed by the API and append it to ./txt/<title>.txt."""
    title = parse.unquote(data['title'])  # titles arrive URL-encoded
    url = 'http:' + data['url']  # presumably a protocol-relative URL; add the scheme
    print(title, url)
    time.sleep(0.1)  # small delay between article requests
    html = detailParse(url)
    # Path separators and characters reserved on Windows would make open()
    # fail (or write outside ./txt), so replace them in the file name only.
    safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)
    with open('./txt/{}.txt'.format(safe_title), 'a+', encoding='utf-8') as f:
        f.write('<title={}>'.format(title) + '\n' + '<neirong={}>'.format(html))


def sohuSpider():
    """Crawl every article of the account identified by ``xpt`` into ./txt.

    Pages of the news-list API are requested one after another (10 items per
    page, starting at page 1). For each listed article the detail page is
    fetched through ``detailParse`` and the title plus cleaned HTML are
    appended to ``./txt/<title>.txt``.

    The crawl stops when a page returns an empty data list, or after five
    consecutive pages fail to download or decode (so a dead network or a
    changed API cannot loop forever). A single failing page is skipped, and a
    single failing article does not abort the rest of its page.

    Returns:
        None. Results are written to disk; progress and errors are printed.
    """
    max_consecutive_failures = 5
    consecutive_failures = 0

    # The output directory must exist before any article can be written.
    os.makedirs('./txt', exist_ok=True)

    pageNumber = 0
    while True:
        pageNumber += 1
        api = 'http://mp.sohu.com/apiV2/profile/newsListAjax?pageSize=10&xpt=%s&pageNumber=%d' % (xpt, pageNumber)
        try:
            response = requests.get(url=api, headers=header, timeout=10)
            data_list = _decode_news_payload(response.text)['data']
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            consecutive_failures += 1
            print('sohuSpider: page {} failed: {!r}'.format(pageNumber, exc))
            if consecutive_failures >= max_consecutive_failures:
                print('sohuSpider: giving up after {} consecutive failed pages'.format(consecutive_failures))
                break
            time.sleep(0.5)
            continue

        consecutive_failures = 0
        if not data_list:
            # An empty (or missing) list marks the end of the article feed.
            break

        for data in data_list:
            try:
                _save_article(data)
            except (KeyError, TypeError, OSError) as exc:
                # Skip only the broken entry; keep the rest of the page.
                print('sohuSpider: skipping article on page {}: {!r}'.format(pageNumber, exc))

        time.sleep(0.5)  # pause between list pages


# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    sohuSpider()

